In [1]:
import pkg_resources
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
In [2]:
from deepforest.gcForest import GCForest
In [3]:
def split_x_y(dataframe, target):
    """
    Separate a dataframe into its feature columns and its target column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both the features and the target.
    target : str
        Label of the column to use as the target.

    Returns
    -------
    tuple
        ``(features, labels)``: ``dataframe`` without the ``target`` column,
        and the ``target`` column on its own. The input frame is not modified.
    """
    features = dataframe.drop(labels=target, axis="columns")
    labels = dataframe[target]
    return features, labels
In [4]:
# Load the training CSV that ships inside the `deepforest` package.
# `resource_stream` returns an open file-like object; the `with` block makes
# sure it is closed once pandas has finished reading from it.
with pkg_resources.resource_stream('deepforest', 'data/train.csv') as train_csv:
    raw_data = pd.read_csv(train_csv)
In [5]:
# Preview the first rows of the freshly loaded frame to inspect its columns.
raw_data.head()
Out[5]:
In [6]:
# Fixed seed so the train/test split (and every score derived from it) is
# the same on each Restart & Run All.
SPLIT_SEED = 42

# Drop the columns that are not used as model features.
clean_data = raw_data.drop(["Cabin", "Name", "PassengerId", "Ticket"], axis=1)
# One-hot encode the remaining non-numeric columns, then replace missing
# values with the sentinel -1. A single pass is enough: running get_dummies
# again on the already-encoded frame changes nothing.
clean_data = pd.get_dummies(clean_data).fillna(-1)
train, test = train_test_split(clean_data, random_state=SPLIT_SEED)
X_train, y_train = split_x_y(train, "Survived")
X_test, y_test = split_x_y(test, "Survived")
In [7]:
def paper_like_models():
    """
    Build the four estimators that make up one layer.

    As in the paper, a layer combines two "classic" random forests with two
    forests standing in for complete-random forests; the latter are random
    forests limited to a single candidate feature per split
    (``max_features=1``).

    Returns
    -------
    list
        Four unfitted ``RandomForestClassifier`` instances: the two classic
        forests first, followed by the two single-feature forests.
    """
    # Hyper-parameters common to every forest in the layer.
    shared_params = dict(n_estimators=1000, n_jobs=-1, min_samples_leaf=10)

    classic_forests = [RandomForestClassifier(**shared_params) for _ in range(2)]
    random_forests = [
        RandomForestClassifier(max_features=1, **shared_params) for _ in range(2)
    ]
    return classic_forests + random_forests
In [8]:
def models_generator():
    """
    Endlessly yield fresh lists of layer models.

    Each iteration calls ``paper_like_models()`` again, so every yielded list
    contains newly created estimators. The generator never terminates on its
    own; the consumer decides when to stop pulling from it.
    """
    while True:
        layer_models = paper_like_models()
        yield layer_models
In [9]:
# Build the cascade object from an endless supply of per-layer model lists,
# with ROC AUC as its metric.
# NOTE(review): presumably GCForest pulls one list per level and uses `metric`
# to compare levels — confirm against the deepforest implementation.
gcForest = GCForest(models_generator(), metric=roc_auc_score)
In [10]:
# Grow the cascade and report how long it takes.
# NOTE(review): X_test / y_test are passed to grow() and are also used for the
# final score further down; if grow() uses them to decide when to stop, that
# score is optimistic — confirm, and consider a separate validation split.
%time gcForest.grow(X_train, y_train, X_test, y_test)
In [11]:
# Predicted class probabilities for the held-out rows; column 1 is used for
# scoring in the next cell.
predictions = gcForest.predict_proba(X_test)
In [12]:
# ROC AUC on the held-out split, computed from column 1 of the predictions
# (presumably the probability of Survived == 1 — confirm the class order).
roc_auc_score(y_test, predictions[:, 1])
Out[12]:
In [13]:
# Display the `levels` attribute of the grown cascade
# (presumably the list of levels that were kept — confirm against GCForest).
gcForest.levels
Out[13]:
In [ ]: